
***************************************************
** ABSOLUTE ERROR & CORRELATIONS - ALL COUNTRIES **
***************************************************
* Produces panel (a) of the figure (saved as Fig2i.gph) and the
* correlations of absolute poll error with election year.
* Flow: load the poll-level file -> filter on daysbeforeED and countryid
* -> absolute error per row -> average within party-election -> plot
* -> average within year -> summary statistics and correlation.
* NOTE(review): scheme(plottig) is not a built-in Stata scheme; it
* presumably comes from a user-written scheme package - confirm it is
* installed before running.
use LONG_MI_NATURE_20180111.dta, clear

* Sample filter: rows with daysbeforeED above 7 (Stata counts missing as
* larger than any number, so missing is removed too) or equal to 0 go.
keep if daysbeforeED<=7
keep if daysbeforeED!=0

* countryid 20 is excluded (labelled Japan elsewhere in this file).
keep if countryid!=20

* Calendar year of the election date.
generate year = yofd(elecdate)

** -> ABSOLUTE ERROR: |vote share - poll share| for each row
generate mae = abs(vote_ - poll_)

** -> IDENTIFIERS: space-separated string keys
*    _couXlegXele_      = country x election x election date
*                         (built here but not referenced again in this section)
*    _couXlegXparXele_  = the same plus partyid
egen _couXlegXele_ = concat(countryid election elecdate), decode punct(" ")
egen _couXlegXparXele_ = concat(countryid election elecdate partyid), decode punct(" ")

* One row per party-election: mean absolute error (year is carried
* through as a mean as well).
collapse (mean) mae year, by(_couXlegXparXele_)

* Yearly mean of the party-election errors, drawn as the solid markers.
egen mean_mae = mean(mae), by(year)

* Panel (a): hollow grey markers = party-election errors,
*            solid black markers = yearly means.
twoway ///
    (scatter mae year, mcolor(gs8) msymbol(Oh)) ///
    (scatter mean_mae year, mcolor(gs0) msymbol(O)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(a) Absolute Error: All Parties/Candidates") ///
    ttitle("") ///
    ytitle("Absolute Error") ///
    ytick(0(5)15) ///
    ylabel(0(5)15) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(Fig2i.gph, replace)

* CORRELATION OF MAE & YEAR ON THE PARTY-ELECTION ROWS
* (the data were collapsed above, so each row is a party-election)
pwcorr mae year, sig obs

* ANNUAL AVERAGES, CORRELATION OF MAE & YEAR
collapse (mean) mae, by(year)

* Mean annual error overall and for three periods:
* before 1960, 1960-1979, and after 1999.
summarize mae
summarize mae if year<1960
summarize mae if year>1959 & year<1980
summarize mae if year>1999

pwcorr mae year, sig obs

***********************************************************************
** ABSOLUTE ERROR & CORRELATIONS - COUNTRIES WITH CONTINUOUS POLLING **
***********************************************************************
* Re-runs the error-by-year correlations on a restricted sample:
* eleven country codes and election years after 1976.
use LONG_MI_NATURE_20180111.dta, clear

* Same sample filter as the all-countries analysis: remove rows with
* daysbeforeED above 7 (missing included) or equal to 0, and countryid 20.
keep if daysbeforeED<=7
keep if daysbeforeED!=0
keep if countryid!=20

* Calendar year of the election date.
generate year = yofd(elecdate)

* Restrict to the listed country codes and to elections after 1976.
keep if inlist(countryid, 2, 6, 14, 15, 18, 23, 24, 25, 39, 40, 43)
drop if year<=1976

** -> ABSOLUTE ERROR: |vote share - poll share| for each row
generate mae = abs(vote_ - poll_)

** -> IDENTIFIERS: space-separated string keys
*    _couXlegXele_ is built here but not referenced again in this section.
egen _couXlegXele_ = concat(countryid election elecdate), decode punct(" ")
egen _couXlegXparXele_ = concat(countryid election elecdate partyid), decode punct(" ")

* One row per party-election.
collapse (mean) mae year, by(_couXlegXparXele_)

* CORRELATION OF MAE & YEAR ON THE PARTY-ELECTION ROWS
pwcorr mae year, sig obs

* ANNUAL AVERAGES, CORRELATION OF MAE & YEAR
collapse (mean) mae, by(year)

pwcorr mae year, sig obs

*******************************************
** ALTERNATIVE MEASURES OF POLL ACCURACY **
*******************************************
* Produces panels (b), (d) and (e): yearly absolute error, yearly
* absolute log odds ratio, and yearly variance of absolute error, each
* with a lowess smoother. Also runs linear trends of mae and var_mae.
* NOTE(review): grubbs and scheme(plottig) are not part of official
* Stata; presumably user-written add-ons - confirm they are installed.
use LONG_MI_NATURE_20180111.dta, clear

* Remove rows with daysbeforeED above 7 (missing included) or equal to 0.
keep if daysbeforeED<=7
keep if daysbeforeED!=0

* DROPPING JAPAN
keep if countryid!=20

* Calendar year of the election date.
generate year = yofd(elecdate)

* MEASURES OF ERROR

** -> ABSOLUTE ERROR
*    mae uses poll_, imae uses ipoll_. imae is not referenced again in
*    this section.
generate mae = abs(vote_ - poll_)
generate imae = abs(vote_ - ipoll_)

** -> ERROR ON THE LEAD
* Key for country x election x date x round; the sort on this key is
* what the by-prefixed commands below rely on.
egen _couXlegXeleXrnd_ = concat(countryid election elecdate round), decode punct(" ")
sort _couXlegXeleXrnd_ vote_

* group  : index increasing with vote_ inside each election-round
* group2 : index of election-round x daysbeforeED cells
egen group = group(countryid election elecdate round vote_)
egen group2 = group(countryid election elecdate round daysbeforeED)

* Highest group index within the election-round ...
by _couXlegXeleXrnd_: egen last = max(group)

* ... so rank 1 is the largest vote share, rank 2 the next distinct one.
generate rank = (last - group) + 1

* Vote shares of the first- and second-ranked entries, then spread to
* every row of the election-round.
by _couXlegXeleXrnd_: generate vote1 = vote_ if rank==1
by _couXlegXeleXrnd_: generate vote2 = vote_ if rank==2

by _couXlegXeleXrnd_: egen vote1st = max(vote1)
by _couXlegXeleXrnd_: egen vote2nd = max(vote2)

* Poll shares of those same two entries, spread within each
* election-round-day cell (group2).
generate poll1 = poll_ if rank==1
generate poll2 = poll_ if rank==2

sort group2
by group2: egen poll1st = max(poll1)
by group2: egen poll2nd = max(poll2)

* winner = 1 when the polls put the first-ranked entry ahead of or level
* with the second-ranked one, 0 when behind, missing when either poll
* share is missing. Not referenced again in this section.
generate winner = .
replace winner = 1 if poll1st>poll2nd
replace winner = 1 if poll1st==poll2nd
replace winner = 0 if poll2nd>poll1st
replace winner = . if poll1st==. | poll2nd==.

* Lead of first over second in the vote and in the polls, and the
* absolute gap between the two leads.
generate votelead = vote1st - vote2nd
generate polllead = poll1st - poll2nd

generate abslead = abs(votelead - polllead)

** ODDS RATIO MEASURE OF POLLING BIAS (ARZHEIMER & EVANS)
* A  = natural log of [poll odds * inverse vote odds], shares out of 100
* Bi = |A| per row; B = mean of Bi within election-round-day
generate A = ln((poll_/(100-poll_)) * ((100-vote_)/vote_))
generate Bi = abs(A)

egen _couXlegXeleXrndXday_ = concat(countryid election elecdate round daysbeforeED), decode punct(" ")
sort _couXlegXeleXrndXday_
by _couXlegXeleXrndXday_: egen B = mean(Bi)

* CALCULATE VARIANCE
* Standard deviation of mae across all rows of a year, squared.
egen sd_mae = sd(mae), by(year)
generate var_mae = sd_mae^2

* N of elections: one table row per country x election x date key
egen _couXlegXele_ = concat(countryid election elecdate), decode punct(" ")
tabulate _couXlegXele_

* GRUBBS-STEFANSKY OUTLIER TEST
* Original author's note: flags the observations for 1985 & 2004.
* Flagged rows are kept; only their var_mae is set to missing.
grubbs var_mae, gen(outlier)
replace var_mae = . if outlier==1

* COLLAPSE --> BY PARTY*ELECTION
egen _couXlegXparXele_ = concat(countryid election elecdate partyid), decode punct(" ")

collapse (mean) mae var_mae abslead Bi B year, by(_couXlegXparXele_)

** -> COLLAPSE FOR ANNUAL AVERAGES
collapse (mean) mae abslead var_mae Bi, by(year)

* Linear trends over election year (variance trend only after 1964).
regress mae year
regress var_mae year if year>1964

* LOWESS *

* Panel (e): yearly variance of absolute error with lowess smoother.
twoway ///
    (scatter var_mae year if year>1940, mcolor(gs0) msymbol(O)) ///
    (lowess var_mae year if year>1940, bwidth(.3) clpattern(solid) clcolor(gs11) clwidth(thick)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(e) Variance of Absolute Error") ///
    ttitle("") ///
    ytitle("Variance of Absolute Error") ///
    ytick(0(2)8) ///
    ylabel(0(2)8) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(lpFig2ii.gph, replace)

* Panel (b): yearly mean absolute error with lowess smoother.
twoway ///
    (scatter mae year if year>1940, mcolor(gs0) msymbol(O)) ///
    (lowess mae year if year>1940, bwidth(.3) clpattern(solid) clcolor(gs11) clwidth(thick)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(b) Absolute Error") ///
    ttitle("") ///
    ytitle("Absolute Error") ///
    ytick(0(2)8) ///
    ylabel(0(2)8) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(lpFig2iv.gph, replace)

* Panel (d): yearly mean absolute log odds ratio with lowess smoother.
twoway ///
    (scatter Bi year if year>1940, mcolor(gs0) msymbol(O)) ///
    (lowess Bi year if year>1940, bwidth(.3) clpattern(solid) clcolor(gs11) clwidth(thick)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(d) Absolute Value of Log Odds Ratio") ///
    ttitle("") ///
    ytitle("Absolute Log Odds Ratio") ///
    ytick(0(.1).3) ///
    ylabel(0(.1).3) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(lpFig2v.gph, replace)


**************************************************************************
** ALTERNATIVE MEASURES OF POLL ACCURACY - ABSOLUTE ERROR ON THE MARGIN **
**************************************************************************
* Produces panel (c): yearly mean of the absolute error on the lead
* between the first- and second-ranked entries, with a lowess smoother.
* The variable construction mirrors the preceding section, but the
* collapse here is by election rather than by party-election, and only
* abslead survives to the final yearly data set. The other measures
* (mae, imae, winner, A, Bi, B, var_mae, outlier) are still computed and
* the tabulate and grubbs output is still printed.
* NOTE(review): grubbs and scheme(plottig) are not part of official
* Stata; presumably user-written add-ons - confirm they are installed.
use LONG_MI_NATURE_20180111.dta, clear

* Remove rows with daysbeforeED above 7 (missing included) or equal to 0.
keep if daysbeforeED<=7
keep if daysbeforeED!=0

* DROPPING JAPAN
keep if countryid!=20

* Calendar year of the election date.
generate year = yofd(elecdate)

* MEASURES OF ERROR

** -> ABSOLUTE ERROR (mae from poll_, imae from ipoll_)
generate mae = abs(vote_ - poll_)
generate imae = abs(vote_ - ipoll_)

** -> ERROR ON THE LEAD
* Key for country x election x date x round; the sort on this key is
* what the by-prefixed commands below rely on.
egen _couXlegXeleXrnd_ = concat(countryid election elecdate round), decode punct(" ")
sort _couXlegXeleXrnd_ vote_

* group  : index increasing with vote_ inside each election-round
* group2 : index of election-round x daysbeforeED cells
egen group = group(countryid election elecdate round vote_)
egen group2 = group(countryid election elecdate round daysbeforeED)

* Highest group index within the election-round ...
by _couXlegXeleXrnd_: egen last = max(group)

* ... so rank 1 is the largest vote share, rank 2 the next distinct one.
generate rank = (last - group) + 1

* Vote shares of the first- and second-ranked entries, spread to every
* row of the election-round.
by _couXlegXeleXrnd_: generate vote1 = vote_ if rank==1
by _couXlegXeleXrnd_: generate vote2 = vote_ if rank==2

by _couXlegXeleXrnd_: egen vote1st = max(vote1)
by _couXlegXeleXrnd_: egen vote2nd = max(vote2)

* Poll shares of those same two entries, spread within each
* election-round-day cell (group2).
generate poll1 = poll_ if rank==1
generate poll2 = poll_ if rank==2

sort group2
by group2: egen poll1st = max(poll1)
by group2: egen poll2nd = max(poll2)

* winner = 1 when the polls put the first-ranked entry ahead of or level
* with the second-ranked one, 0 when behind, missing when either poll
* share is missing.
generate winner = .
replace winner = 1 if poll1st>poll2nd
replace winner = 1 if poll1st==poll2nd
replace winner = 0 if poll2nd>poll1st
replace winner = . if poll1st==. | poll2nd==.

* Lead of first over second in the vote and in the polls, and the
* absolute gap between the two leads (the quantity plotted below).
generate votelead = vote1st - vote2nd
generate polllead = poll1st - poll2nd

generate abslead = abs(votelead - polllead)

** ODDS RATIO MEASURE OF POLLING BIAS (ARZHEIMER & EVANS)
generate A = ln((poll_/(100-poll_)) * ((100-vote_)/vote_))
generate Bi = abs(A)

egen _couXlegXeleXrndXday_ = concat(countryid election elecdate round daysbeforeED), decode punct(" ")
sort _couXlegXeleXrndXday_
by _couXlegXeleXrndXday_: egen B = mean(Bi)

* CALCULATE VARIANCE
egen sd_mae = sd(mae), by(year)
generate var_mae = sd_mae^2

* N of elections: one table row per country x election x date key
egen _couXlegXele_ = concat(countryid election elecdate), decode punct(" ")
tabulate _couXlegXele_

* GRUBBS-STEFANSKY OUTLIER TEST
* Original author's note: flags the observations for 1985 & 2004.
* Flagged rows are kept; only their var_mae is set to missing.
grubbs var_mae, gen(outlier)
replace var_mae = . if outlier==1

* COLLAPSE --> BY ELECTION
* The party-election key is still generated, but the collapse below
* groups on the election key _couXlegXele_.
egen _couXlegXparXele_ = concat(countryid election elecdate partyid), decode punct(" ")

collapse (mean) mae var_mae abslead Bi B year, by(_couXlegXele_)

** -> COLLAPSE FOR ANNUAL AVERAGES (abslead only)
collapse (mean) abslead, by(year)

* LOWESS *

* Panel (c): yearly mean absolute error on the margin with lowess smoother.
twoway ///
    (scatter abslead year if year>1940, mcolor(gs0) msymbol(O)) ///
    (lowess abslead year if year>1940, bwidth(.3) clpattern(solid) clcolor(gs11) clwidth(thick)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(c) Absolute Error on the Margin") ///
    ttitle("") ///
    ytitle("Absolute Error on the Margin") ///
    ytick(0(5)15) ///
    ylabel(0(5)15) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(lpFig2iii.gph, replace)

************************************************************************************
** VARIANCE OF ABSOLUTE ERROR - FOR COUNTRIES WHERE CONTINUOUS POLLING, 1977-2017 **
************************************************************************************
* Produces panel (f): yearly variance of absolute error with a lowess
* smoother, on the restricted sample of eleven country codes and
* election years after 1976, plus a linear trend of that variance.
* NOTE(review): grubbs and scheme(plottig) are not part of official
* Stata; presumably user-written add-ons - confirm they are installed.
use LONG_MI_NATURE_20180111.dta, clear

* Remove rows with daysbeforeED above 7 (missing included) or equal to
* 0, and countryid 20.
keep if daysbeforeED<=7
keep if daysbeforeED!=0
keep if countryid!=20

* Calendar year of the election date.
generate year = yofd(elecdate)

** -> RESTRICT TO COUNTRIES WITH CONTINUOUS POLLING, 1977 ONWARDS
keep if inlist(countryid, 2, 6, 14, 15, 18, 23, 24, 25, 39, 40, 43)
drop if year<=1976

** -> ABSOLUTE ERROR: |vote share - poll share| for each row
generate mae = abs(vote_ - poll_)

* COLLAPSE --> BY PARTY*ELECTION
egen _couXlegXparXele_ = concat(countryid election elecdate partyid), decode punct(" ")
collapse (mean) mae year, by(_couXlegXparXele_)

* Variance by year. Here the standard deviation is taken after the
* collapse, i.e. across party-election averages within each year.
egen sd_mae = sd(mae), by(year)
generate var_mae = sd_mae^2

* GRUBBS-STEFANSKY OUTLIER TEST
* Flagged rows are kept; only their var_mae is set to missing.
* NOTE(review): the original comment here read "drops observations for
* 1985 & 2004", identical to the full-sample sections - verify which
* years are actually flagged in this restricted sample.
grubbs var_mae, gen(outlier)
replace var_mae = . if outlier==1

** -> COLLAPSE FOR ANNUAL AVERAGES (plain yearly means, no moving window)
collapse (mean) mae var_mae, by(year)

* TREND IN VARIANCE OF ABSOLUTE ERRORS
regress var_mae year if year>1976

* Panel (f). The x axis keeps the 1940-2020 ticks and labels used by
* the other panels even though the sample starts after 1976.
twoway ///
    (scatter var_mae year if year>1976, mcolor(gs0) msymbol(O)) ///
    (lowess var_mae year if year>1976, bwidth(.3) clpattern(solid) clcolor(gs11) clwidth(thick)) ///
    , scheme(plottig) ///
    graphregion(color(white)) ///
    title("(f) Variance of Absolute Error: Data Continuously Available") ///
    ttitle("") ///
    ytitle("Variance of Absolute Error") ///
    ytick(0(2)8) ///
    ylabel(0(2)8) ///
    ylabel(,angle(horizontal)) ///
    xtick(1940(5)2020) ///
    xlabel(1940(10)2020) ///
    legend(off) ///
    saving(lpFig2vi.gph, replace)

*******************************
** COMBINE GRAPHS FOR FIGURE **
*******************************
* Assembles the six saved panels into one figure with three rows. The
* files are listed in panel order (a) to (f), as given by the panel
* titles set when each .gph file was saved:
*   Fig2i = (a), lpFig2iv = (b), lpFig2iii = (c),
*   lpFig2v = (d), lpFig2ii = (e), lpFig2vi = (f).
* All six .gph files must already exist in the working directory.

graph combine ///
    Fig2i.gph lpFig2iv.gph ///
    lpFig2iii.gph lpFig2v.gph ///
    lpFig2ii.gph lpFig2vi.gph ///
    , graphregion(color(white)) rows(3)

* Export the combined figure as PNG (3000 pixels wide) and as EPS,
* overwriting any existing files.
graph export PollError_Fig2.png, width(3000) replace
graph export PollError_Fig2.eps, replace
